library(readr)
library(tuber)

#directory <- "data/"
#
#terms <- c(
#  "R Programming", 
#  "Beginner R Programming",
#  "Intermediate R Programming",
#  "Advanced R Programming"
#)
#
#numTerms <- length(terms)
#
## Grab general info from search endpoint for each term
#for (x in 1:numTerms) {
#  term <- terms[x]
#  file <- paste(term, ".csv", sep = "")
#  
#  filePath <- paste(directory, file, sep = "")
#
#  videos <- yt_search(term)
#  videos <- videos[, c(1,2,3,4,5,6,15)]
#  # [1] "video_id"                  "publishedAt"               "channelId"                
#  # [4] "title"                     "description"               "thumbnails.default.url"   
#  # [7] "thumbnails.default.width"  "thumbnails.default.height" "thumbnails.medium.url"    
#  #[10] "thumbnails.medium.width"   "thumbnails.medium.height"  "thumbnails.high.url"      
#  #[13] "thumbnails.high.width"     "thumbnails.high.height"    "channelTitle"             
#  #[16] "liveBroadcastContent"      "publishTime"
#
#  videos <- videos %>%
#    add_column(viewCount = NA,
#               likeCount = NA,
#               dislikeCount = NA,
#               favoriteCount = NA,
#               commentCount = NA,
#               tags = NA)
#
#  videos
#  write_csv(videos, filePath)
#}
library(readr)
library(tuber)
library(dplyr)
library(lubridate)

#directory <- "data/"
#
#files <- c(
#  "R Programming.csv", 
#  "Beginner R Programming.csv",
#  "Intermediate R Programming.csv",
#  "Advanced R Programming.csv"
#)
#
#numFiles <- length(files)
#
## Get details for top 100 results per search (sorted by relevancy)
#for (x in 1:numFiles) {
#  file <- files[x]
#  filePath <- paste(directory, file, sep = "")
#  
#  videos <- read_csv(filePath)
#  
#  videos <- videos %>%
#    add_column(viewCount = NA,
#               likeCount = NA,
#               dislikeCount = NA,
#               favoriteCount = NA,
#               commentCount = NA,
#               tags = NA)
#  
#  numVideos <- nrow(videos)
#  maxRelevant <- 100
#  
#  for (y in 1:maxRelevant) {
#    videoId <- videos[[y,1]]
#    channelId <- videos[[y,3]]
#  
#    
#    
#    stats <- get_stats(videoId)
#    videos[y, 8] <- if ("viewCount" %in% names(stats) && !is.null(stats[["viewCount"]])) as.double(stats$viewCount) else 0
#    videos[y, 9] <- if ("likeCount" %in% names(stats) && !is.null(stats[["likeCount"]])) as.double(stats$likeCount) else 0
#    videos[y, 10] <- if ("dislikeCount" %in% names(stats) && !is.null(stats[["dislikeCount"]])) as.double(stats$dislikeCount) else 0
#    videos[y, 11] <- if ("favoriteCount" %in% names(stats) && !is.null(stats[["favoriteCount"]])) as.double(stats$favoriteCount) else 0
#    videos[y, 12] <- if ("commentCount" %in% names(stats) && !is.null(stats[["commentCount"]])) as.double(stats$commentCount) else 0
#    
#    
#    
#    details <- get_video_details(videoId)
#    items <- details$items[[1]]
#    snippet <- items$snippet
#    
#    tags <- if ("tags" %in% names(snippet) && !is.null(snippet[["tags"]])) snippet$tags else c()
#    numTags <- length(tags)
#    tagConcat <- ""
#      
#    for (z in 1:numTags) {
#      tagConcat <- paste(tagConcat, tags[[z]], sep = if (z == 1) "" else ",")
#    }
#    
#    videos[y, 2] <- round_date(videos[y, 2], "day")
#    videos[y, 5] <- if ("description" %in% names(snippet) && !is.null(snippet[["description"]])) snippet$description else ""
#    videos[y, 13] <- tagConcat
#    
#    
#    
#    #captions <- get_captions(videoId)
#    
#    
#    
#    #comments <- get_comment_threads(c(video_id = videoId))
#  }
#}
#
#videos
#write_csv(videos, filePath)
library(readr)
library(dplyr)
library(lubridate)
library(stringr)

directory <- "data/"

# Per-term search result CSVs produced by the collection steps above
rFilePath <- paste0(directory, "R Programming.csv")
beginnerFilePath <- paste0(directory, "Beginner R Programming.csv")
intermediateFilePath <- paste0(directory, "Intermediate R Programming.csv")
advancedFilePath <- paste0(directory, "Advanced R Programming.csv")

# Each file has 13 columns: video_id, publishedAt (dttm), channelId, title,
# description, thumbnail URL, channelTitle, the five count columns, and tags
rVideos <- read_csv(rFilePath)                        # 594 rows
beginnerVideos <- read_csv(beginnerFilePath)          # 597 rows
intermediateVideos <- read_csv(intermediateFilePath)  # 566 rows
advancedVideos <- read_csv(advancedFilePath)          # 601 rows

# Only the top 100 results per search (sorted by relevance) are analyzed
maxRelevant <- 100

# Dedupe: from each targeted search's top results, drop any record that also
# appears in the generic search or in the other targeted searches, for good
# measure
beginnerVideosUnique <- Reduce(
  setdiff,
  list(rVideos, intermediateVideos, advancedVideos),
  init = beginnerVideos[1:maxRelevant, ]
)

intermediateVideosUnique <- Reduce(
  setdiff,
  list(rVideos, beginnerVideos, advancedVideos),
  init = intermediateVideos[1:maxRelevant, ]
)

advancedVideosUnique <- Reduce(
  setdiff,
  list(rVideos, beginnerVideos, intermediateVideos),
  init = advancedVideos[1:maxRelevant, ]
)



# Tag each deduped set with its difficulty level; rVersion and relevant are
# placeholders that get filled in below. mutate() (dplyr, attached above) is
# used instead of add_column() because add_column() lives in the tibble
# package, which this script never attaches.
beginnerVideosUnique <- beginnerVideosUnique %>%
  mutate(level = "beginner",
         rVersion = NA,
         relevant = NA)

intermediateVideosUnique <- intermediateVideosUnique %>%
  mutate(level = "intermediate",
         rVersion = NA,
         relevant = NA)

advancedVideosUnique <- advancedVideosUnique %>%
  mutate(level = "advanced",
         rVersion = NA,
         relevant = NA)



# Check relevancy in the deduped sets: a video counts as relevant if the
# standalone word "R" appears in its title, tags, or description
# (case-insensitive; \b boundaries keep "R" from matching inside other words).
rWordPattern <- regex("\\bR\\b", ignore_case = TRUE)

# Round the publish timestamp to the nearest day and flag relevancy.
# Columns are referenced directly inside mutate() rather than via df$col,
# which bypasses tidy evaluation and silently ignores grouping.
flagRelevant <- function(videos) {
  videos %>%
    mutate(
      publishedAt = round_date(publishedAt, "day"),
      relevant = str_detect(title, rWordPattern)
                 | str_detect(tags, rWordPattern)
                 | str_detect(description, rWordPattern)
    )
}

beginnerVideosUnique <- flagRelevant(beginnerVideosUnique)
intermediateVideosUnique <- flagRelevant(intermediateVideosUnique)
advancedVideosUnique <- flagRelevant(advancedVideosUnique)



rVersionFile <- "R Versions.csv"
rVersionsPath <- paste(directory, rVersionFile, sep = "")
# 89 rows, 2 columns: an unnamed version column (read as ...1) and "date"
rVersions <- read_delim(rVersionsPath, trim_ws = TRUE)

# The date column is apparently "<Month name>,<year>" — TODO confirm against
# the raw file. Rebuild it as a POSIXct pinned to the first of that month.
# Vectorized instead of the original row-by-row loop, and the new column is
# assigned directly instead of via tibble::add_column (tibble is never
# attached in this script).
dateSplits <- str_split(rVersions$date, ",")

releaseMonths <- vapply(dateSplits, function(parts) parts[1], character(1))
releaseYears <- vapply(dateSplits, function(parts) parts[2], character(1))

rVersions$dateReleased <- as.POSIXct(
  paste(releaseYears, match(releaseMonths, month.name), "01", sep = "-"),
  format = "%Y-%m-%d"
)

write_csv(rVersions, rVersionsPath)


# Check which R version was current when each video was released. rVersions
# is assumed to be sorted newest-first — TODO confirm — so the first release
# date (column 3) that is not after the publish date (column 2) wins; its
# version label (column 1) is written into the rVersion column (column 15).
# seq_len() replaces 1:n, which would mis-iterate (1, 0) on an empty table.
numVersions <- nrow(rVersions)

numBeginnerVideos <- nrow(beginnerVideosUnique)

for (x in seq_len(numBeginnerVideos)) {
  for (v in seq_len(numVersions)) {
    # Skip versions released after the video; take the first one that wasn't
    if (!(beginnerVideosUnique[x, 2] < rVersions[v, 3])) {
      beginnerVideosUnique[x, 15] <- rVersions[v, 1]
      break
    }
  }
}

numIntermediateVideos <- nrow(intermediateVideosUnique)

# Same version-matching scan for the intermediate set: write the newest R
# version released on or before each video's publish date into column 15.
# seq_len() replaces 1:n, which would mis-iterate (1, 0) on an empty table.
for (x in seq_len(numIntermediateVideos)) {
  for (v in seq_len(numVersions)) {
    if (!(intermediateVideosUnique[x, 2] < rVersions[v, 3])) {
      intermediateVideosUnique[x, 15] <- rVersions[v, 1]
      break
    }
  }
}

numAdvancedVideos <- nrow(advancedVideosUnique)

# Same version-matching scan for the advanced set: write the newest R
# version released on or before each video's publish date into column 15.
# seq_len() replaces 1:n, which would mis-iterate (1, 0) on an empty table.
for (x in seq_len(numAdvancedVideos)) {
  for (v in seq_len(numVersions)) {
    if (!(advancedVideosUnique[x, 2] < rVersions[v, 3])) {
      advancedVideosUnique[x, 15] <- rVersions[v, 1]
      break
    }
  }
}



# Combine the three level-tagged sets and persist each one plus the union
uniqueAll <- beginnerVideosUnique %>%
  union(intermediateVideosUnique) %>%
  union(advancedVideosUnique)

write_csv(beginnerVideosUnique, paste0(directory, "Beginner R Programming Unique.csv"))
write_csv(intermediateVideosUnique, paste0(directory, "Intermediate R Programming Unique.csv"))
write_csv(advancedVideosUnique, paste0(directory, "Advanced R Programming Unique.csv"))
write_csv(uniqueAll, paste0(directory, "R Programming Unique.csv"))
library(dplyr)
library(ggplot2)
library(ggrepel)
library(psych)
library(readr)
library(runner)
library(stopwords)
library(stringr)
library(tidytext)
library(wordcloud)

directory <- "data/"

# Combined, deduped data set produced by the preparation script
# (278 rows, 16 columns; "relevant" is logical, "publishedAt" a datetime)
rUniqueFilePath <- paste0(directory, "R Programming Unique.csv")
rVideosUnique <- read_delim(rUniqueFilePath, trim_ws = TRUE)

# Videos whose title, tags, or description mention the standalone word "R"
rVideosUniqueRelevant <- rVideosUnique %>%
  filter(relevant)

# "Misinformation": despite the names, these per-level subsets keep the
# irrelevant rows too so the relevancy plots below can colour by `relevant`.
# Relevancy rates: beginner 66/92 = 71%, intermediate 71/95 = 75%,
# advanced 90/91 = 99%
beginnerVideosUniqueRelevant <- rVideosUnique %>%
  filter(level == "beginner")

intermediateVideosUniqueRelevant <- rVideosUnique %>%
  filter(level == "intermediate")

advancedVideosUniqueRelevant <- rVideosUnique %>%
  filter(level == "advanced")



# When did the market saturate? Scatter each level's videos by publish date,
# coloured by whether the video is actually about R.
ggplot(beginnerVideosUniqueRelevant, aes(publishedAt, viewCount)) +
  labs(title = "Beginner Relevancy by Date") +
  geom_point(aes(colour = factor(relevant)))

ggplot(intermediateVideosUniqueRelevant, aes(publishedAt, viewCount)) +
  labs(title = "Intermediate Relevancy by Date") +
  geom_point(aes(colour = factor(relevant)))

ggplot(advancedVideosUniqueRelevant, aes(publishedAt, viewCount)) +
  labs(title = "Advanced Relevancy by Date") +
  geom_point(aes(colour = factor(relevant)))

# Assuming most of a video's traffic arrives within its first couple of
# months, a steadily rising cumulative-view curve suggests steady growth.
# The number of older videos also speaks to scarcity of newer ones in
# relevance-sorted results — one might expect new videos to rank higher.
viewsByDate <- rVideosUniqueRelevant %>%
  arrange(publishedAt)

ggplot(viewsByDate,
       aes(publishedAt, sum_run(x = viewCount, idx = publishedAt))) +
  labs(title = "Cumulative Views by Date") +
  geom_smooth(method = lm) +
  geom_point()

ggplot(rVideosUniqueRelevant, aes(publishedAt, viewCount)) +
  labs(title = "Views by Date") +
  geom_point(aes(colour = factor(level)))

# Zoom in on the low-view tail
lowViewVideos <- rVideosUniqueRelevant %>%
  filter(viewCount < 1000)

ggplot(lowViewVideos, aes(publishedAt, viewCount)) +
  labs(title = "Views by Date") +
  geom_point(aes(colour = factor(level)))

ggplot(rVideosUniqueRelevant, aes(rVersion, viewCount)) +
  geom_point(aes(colour = factor(level))) +
  labs(title = "Views by R version") +
  theme(axis.text.x = element_text(angle = 45))

# View counts across all relevant videos (n = 227) are heavily right-skewed:
# median 196 views vs mean ~36k, skew 8.4
describe(rVideosUniqueRelevant$viewCount)

# Full column summary: publish dates span 2010-02 to 2021-10; favoriteCount
# is uniformly 0
summary(rVideosUniqueRelevant)
# Beginner videos (n = 92): even more skewed, median ~2.6k vs mean ~466k views
describe(beginnerVideosUniqueRelevant$viewCount)

# 25 of the 92 beginner results are not actually about R
summary(beginnerVideosUniqueRelevant)
# Intermediate videos (n = 95): median 619 views vs mean ~221k
describe(intermediateVideosUniqueRelevant$viewCount)

# 19 of the 95 intermediate results are not actually about R
summary(intermediateVideosUniqueRelevant)
# Advanced videos (n = 91): far smaller audience, median 89 vs mean ~4.8k views
describe(advancedVideosUniqueRelevant$viewCount)

# Only 1 of the 91 advanced results is not actually about R
summary(advancedVideosUniqueRelevant)
# Engagement distributions by difficulty level
ggplot(rVideosUniqueRelevant, aes(level, viewCount)) +
  labs(title = "Views by Difficulty") +
  geom_boxplot()

ggplot(rVideosUniqueRelevant, aes(level, likeCount)) +
  labs(title = "Likes by Difficulty") +
  geom_boxplot()

ggplot(rVideosUniqueRelevant, aes(level, dislikeCount)) +
  labs(title = "Dislikes by Difficulty") +
  geom_boxplot()

ggplot(rVideosUniqueRelevant, aes(level, commentCount)) +
  labs(title = "Comments by Difficulty") +
  geom_boxplot()

# Per-channel footprint: number of videos vs total views, labelled with the
# channel name (ggrepel omits labels it cannot place without overlapping)
summariseChannels <- function(videos) {
  videos %>%
    group_by(channelTitle) %>%
    summarise(n = n(), viewCount = sum(viewCount))
}

plotChannels <- function(channels, plotTitle) {
  ggplot(channels, aes(n, viewCount)) +
    geom_point() +
    ggtitle(plotTitle) +
    geom_label_repel(aes(label = channelTitle),
                     box.padding   = 0.35,
                     point.padding = 0.5,
                     segment.color = 'grey50')
}

rVideosUniqueRelevantChannels <- summariseChannels(rVideosUniqueRelevant)
plotChannels(rVideosUniqueRelevantChannels, "All Channel Videos Count by Views")

beginnerVideosUniqueRelevantChannels <- summariseChannels(beginnerVideosUniqueRelevant)
plotChannels(beginnerVideosUniqueRelevantChannels, "Beginner Channel Video Count by Views")

intermediateVideosUniqueRelevantChannels <- summariseChannels(intermediateVideosUniqueRelevant)
plotChannels(intermediateVideosUniqueRelevantChannels, "Intermediate Channel Video Count by Views")

advancedVideosUniqueRelevantChannels <- summariseChannels(advancedVideosUniqueRelevant)
plotChannels(advancedVideosUniqueRelevantChannels, "Advanced Channel Video Count by Views")

# Word clouds of the most frequent title / description / tag words
maxWords <- 100

# Drop tokens that end in two or more digits, are purely numeric, or contain
# a dot (URLs, version numbers, file names)
filterRegex <- ".*[0-9][0-9]+$|^[0-9]*$|.*\\..*"

# Tokenize one text column (tidytext::unnest_tokens), remove English stop
# words and junk tokens, and count word frequencies. `col` is the bare column
# name, forwarded via tidy evaluation.
countWords <- function(df, col) {
  df %>%
    unnest_tokens(output = word, input = {{ col }}) %>%
    anti_join(get_stopwords(), by = "word") %>%
    filter(!str_detect(word, regex(filterRegex))) %>%
    count(word)
}

# Titles: all relevant videos, then per level
titleWords <- countWords(rVideosUniqueRelevant[, 4], title)
wordcloud(words = titleWords$word, freq = titleWords$n, max.words = maxWords)

titleWords <- countWords(beginnerVideosUniqueRelevant[, 4], title)
wordcloud(words = titleWords$word, freq = titleWords$n, max.words = maxWords)

titleWords <- countWords(intermediateVideosUniqueRelevant[, 4], title)
wordcloud(words = titleWords$word, freq = titleWords$n, max.words = maxWords)

titleWords <- countWords(advancedVideosUniqueRelevant[, 4], title)
wordcloud(words = titleWords$word, freq = titleWords$n, max.words = maxWords)

# Descriptions
descriptionWords <- countWords(rVideosUniqueRelevant[, 5], description)
wordcloud(words = descriptionWords$word, freq = descriptionWords$n, max.words = maxWords)

descriptionWords <- countWords(beginnerVideosUniqueRelevant[, 5], description)
wordcloud(words = descriptionWords$word, freq = descriptionWords$n, max.words = maxWords)

descriptionWords <- countWords(intermediateVideosUniqueRelevant[, 5], description)
wordcloud(words = descriptionWords$word, freq = descriptionWords$n, max.words = maxWords)

descriptionWords <- countWords(advancedVideosUniqueRelevant[, 5], description)
wordcloud(words = descriptionWords$word, freq = descriptionWords$n, max.words = maxWords)

# Tags are comma separated; convert commas to spaces so unnest_tokens splits
# them into individual words
rVideosUniqueRelevant$tags <- gsub(",", " ", rVideosUniqueRelevant$tags)

tagsWords <- countWords(rVideosUniqueRelevant[, 13], tags)
wordcloud(words = tagsWords$word, freq = tagsWords$n, max.words = maxWords)

beginnerVideosUniqueRelevant$tags <- gsub(",", " ", beginnerVideosUniqueRelevant$tags)

tagsWords <- countWords(beginnerVideosUniqueRelevant[, 13], tags)
wordcloud(words = tagsWords$word, freq = tagsWords$n, max.words = maxWords)

intermediateVideosUniqueRelevant$tags <- gsub(",", " ", intermediateVideosUniqueRelevant$tags)

tagsWords <- countWords(intermediateVideosUniqueRelevant[, 13], tags)
wordcloud(words = tagsWords$word, freq = tagsWords$n, max.words = maxWords)

advancedVideosUniqueRelevant$tags <- gsub(",", " ", advancedVideosUniqueRelevant$tags)

tagsWords <- countWords(advancedVideosUniqueRelevant[, 13], tags)
wordcloud(words = tagsWords$word, freq = tagsWords$n, max.words = maxWords)